1 + 2 + 3[1] 6
1 + 2 * 3[1] 7
(1 + 2) * 3[1] 9
c(0,1,2,3,5,8)[1] 0 1 2 3 5 8
1:11 [1] 1 2 3 4 5 6 7 8 9 10 11
1:11 + 3 [1] 4 5 6 7 8 9 10 11 12 13 14
x <- 3
y <- 1:10
z <- c(1,20,3,6)
x[1] 3
y [1] 1 2 3 4 5 6 7 8 9 10
z[1] 1 20 3 6
x + y [1] 4 5 6 7 8 9 10 11 12 13
x * z[1] 3 60 9 18
?mean
help(mean)
my.character <- c('Emma','Olivia','Sophia')
my.character[1] "Emma" "Olivia" "Sophia"
my.dates <- c(as.Date("2012-04-05"), Sys.Date())
my.dates[1] "2012-04-05" "2016-10-25"
my.numeric <- c(12,62.3,7,101)
my.numeric[1] 12.0 62.3 7.0 101.0
my.factor <- factor(c(0,0,0,1,1), labels=c('Female','Male'))
my.factor[1] Female Female Female Male Male
Levels: Female Male
my.logical <- c(TRUE,FALSE,FALSE)
my.logical[1] TRUE FALSE FALSE
weight.kg <- c(60, 72, 57, 90, 95, 72)
height.meters <- c(1.75, 1.80, 1.65, 1.90, 1.74, 1.91)
bmi <- weight.kg/height.meters^2
bmi[1] 19.59184 22.22222 20.93664 24.93075 31.37799 19.73630
byear <- c(68,69,71,72)
byear[1] 68 69 71 72
Boys <- data.frame(kid = c('JHC','CMC','REC','WBC'),
byear = c(68,69,70,71))
Boys kid byear
1 JHC 68
2 CMC 69
3 REC 70
4 WBC 71
names(Boys)[1] "kid" "byear"
str(Boys)'data.frame': 4 obs. of 2 variables:
$ kid : Factor w/ 4 levels "CMC","JHC","REC",..: 2 1 3 4
$ byear: num 68 69 70 71
Boys$kid[1] JHC CMC REC WBC
Levels: CMC JHC REC WBC
with(Boys, mean(byear))[1] 69.5
ls() [1] "bmi" "Boys" "byear" "height.meters"
[5] "my.character" "my.dates" "my.factor" "my.logical"
[9] "my.numeric" "weight.kg" "x" "y"
[13] "z"
library(foreign)
ls("package:foreign") [1] "data.restore" "lookup.xport" "read.arff" "read.dbf"
[5] "read.dta" "read.epiinfo" "read.mtp" "read.octave"
[9] "read.S" "read.spss" "read.ssd" "read.systat"
[13] "read.xport" "write.arff" "write.dbf" "write.dta"
[17] "write.foreign"
mytable <- read.table("c:/chuck/NYU/Stat Group/table1.txt", sep=",", header=TRUE)
mytable id female inc80 inc81 inc82
1 1 0 5000 5500 6000
2 2 1 2000 2200 3300
3 3 0 3000 2000 1000
str(mytable)'data.frame': 3 obs. of 5 variables:
$ id : int 1 2 3
$ female: int 0 1 0
$ inc80 : int 5000 2000 3000
$ inc81 : int 5500 2200 2000
$ inc82 : int 6000 3300 1000
library(foreign)
titanic <- read.spss("c:/chuck/NYU/Stat Group/titanic.sav", to.data.frame = TRUE)
head(titanic) pclass survived age fare gender
1 First Yes 29.0000 211.3375 Female
2 First Yes 0.9167 151.5500 Male
3 First No 2.0000 151.5500 Female
4 First No 30.0000 151.5500 Male
5 First No 25.0000 151.5500 Female
6 First Yes 48.0000 26.5500 Male
attributes(titanic)$variable.labels pclass survived age
"Passenger Class" "Survived" "Passenger Age"
fare gender
"Passenger Fare" "Passenger Gender"
library(openxlsx)
my.files <- list.files("c:/chuck/NYU/Stat Group/R Workshop/Many Files/", pattern = ".xlsx$", full.names=TRUE, recursive=TRUE)
my.files # vector of the file names with full path[1] "c:/chuck/NYU/Stat Group/R Workshop/Many Files/File1.xlsx"
[2] "c:/chuck/NYU/Stat Group/R Workshop/Many Files/File2.xlsx"
[3] "c:/chuck/NYU/Stat Group/R Workshop/Many Files/File3.xlsx"
[4] "c:/chuck/NYU/Stat Group/R Workshop/Many Files/File4.xlsx"
[5] "c:/chuck/NYU/Stat Group/R Workshop/Many Files/File5.xlsx"
[6] "c:/chuck/NYU/Stat Group/R Workshop/Many Files/File6.xlsx"
[7] "c:/chuck/NYU/Stat Group/R Workshop/Many Files/File7.xlsx"
[8] "c:/chuck/NYU/Stat Group/R Workshop/Many Files/File8.xlsx"
my.sheets <- lapply(my.files, function(x){getSheetNames(x)}) # list of worksheet names within each MS-Excel workbook
unlist(my.sheets)[1] "Sheet 1" "Sheet 1" "Sheet 1" "Sheet 1" "Sheet 1" "Sheet 1" "Sheet 1"
[8] "Sheet 1"
my.sheets.DF <- data.frame(file = rep(my.files, unlist(lapply(my.sheets, length))), sheet = unlist(my.sheets))
my.sheets.DF file sheet
1 c:/chuck/NYU/Stat Group/R Workshop/Many Files/File1.xlsx Sheet 1
2 c:/chuck/NYU/Stat Group/R Workshop/Many Files/File2.xlsx Sheet 1
3 c:/chuck/NYU/Stat Group/R Workshop/Many Files/File3.xlsx Sheet 1
4 c:/chuck/NYU/Stat Group/R Workshop/Many Files/File4.xlsx Sheet 1
5 c:/chuck/NYU/Stat Group/R Workshop/Many Files/File5.xlsx Sheet 1
6 c:/chuck/NYU/Stat Group/R Workshop/Many Files/File6.xlsx Sheet 1
7 c:/chuck/NYU/Stat Group/R Workshop/Many Files/File7.xlsx Sheet 1
8 c:/chuck/NYU/Stat Group/R Workshop/Many Files/File8.xlsx Sheet 1
my.list <- vector(mode="list", length = nrow(my.sheets.DF)) # empty list to hold each data file
for(i in 1:nrow(my.sheets.DF)){
my.list[[i]] <- read.xlsx(xlsxFile = as.character(my.sheets.DF$file)[i], sheet = as.character(my.sheets.DF$sheet)[i])
}
my.DF <- as.data.frame(do.call(rbind, my.list))
dim(my.DF) # 80 observations, 2 variables [1] 80 2
library(foreign)
library(car)
auto <- read.spss("c:/chuck/NYU/Stat Group/R Workshop/auto.sav", to.data.frame=TRUE)
summary(auto$MPG)
auto$MPG3 <- car::recode(auto$MPG, "lo:18=1; 19:23=2; 24:hi=3")
table(auto$MPG3)
1 2 3
27 24 23
Two data frames with a common ID variable
A <- data.frame(famid = c(2,1,3),
name = c('Art','Bill','Paul'),
inc = c(22,30,25))
B <- data.frame(famid = c(3,1,2),
faminc96 = c(75,40,45),
faminc97 = c(76,40.5,40.4),
faminc98 = c(77,41,45.8))
intersect(names(A), names(B))[1] "famid"
One Merged Data Frame
A famid name inc
1 2 Art 22
2 1 Bill 30
3 3 Paul 25
B famid faminc96 faminc97 faminc98
1 3 75 76.0 77.0
2 1 40 40.5 41.0
3 2 45 40.4 45.8
AB <- merge(A, B)
AB famid name inc faminc96 faminc97 faminc98
1 1 Bill 30 40 40.5 41.0
2 2 Art 22 45 40.4 45.8
3 3 Paul 25 75 76.0 77.0
bank <- read.spss("c:/chuck/NYU/Stat Group/bank.sav", to.data.frame = TRUE)
head(bank) ID SALBEG SEX TIME AGE SALNOW EDLEVEL WORK JOBCAT
1 628 8400 Males 81 28.50 16080 16 0.25 College trainee
2 630 24000 Males 73 40.33 41400 16 12.50 Exempt employee
3 632 10200 Males 83 31.08 21960 15 4.08 Exempt employee
4 633 8700 Males 93 31.17 19200 16 1.83 College trainee
5 635 17400 Males 83 41.92 28350 19 13.00 Exempt employee
6 637 12996 Males 80 29.50 27250 18 2.42 College trainee
MINORITY SEXRACE EGENDER EMINORIT EGENXMIN BSALCENT CSALADJ
1 White White males -1 -1 1 1593.57 13133.489
2 White White males -1 -1 1 17193.57 9609.089
3 White White males -1 -1 1 3393.57 15685.289
4 White White males -1 -1 1 1893.57 15698.789
5 White White males -1 -1 1 10593.57 8762.489
6 White White males -1 -1 1 6189.57 15805.485
SALDELTA RES_1 GENDER
1 7680 -1013.504 M
2 17400 -4543.634 M
3 11760 1537.635 M
4 10500 1551.686 M
5 10950 -5387.809 M
6 14254 1656.804 M
The head() function is a convenient way to see the first few observations of a data object
With the subset() function
subset(bank, SALBEG < 4000 & SEX == "Males", select=c(ID,SEX,SALBEG,AGE,SALNOW)) ID SEX SALBEG AGE SALNOW
414 926 Males 3600 53.50 12300
429 1077 Males 3900 29.17 9000
bank[bank$SALBEG < 4000 & bank$SEX == "Males", c(1,3,2,5,6)] ID SEX SALBEG AGE SALNOW
414 926 Males 3600 53.50 12300
429 1077 Males 3900 29.17 9000
mytable <- read.table("c:/chuck/NYU/Stat Group/table1.txt", sep=",", header=TRUE)
mytable id female inc80 inc81 inc82
1 1 0 5000 5500 6000
2 2 1 2000 2200 3300
3 3 0 3000 2000 1000
library(dplyr)
library(tidyr)
longtab <- gather(mytable, year, inc, inc80:inc82)
longtab id female year inc
1 1 0 inc80 5000
2 2 1 inc80 2000
3 3 0 inc80 3000
4 1 0 inc81 5500
5 2 1 inc81 2200
6 3 0 inc81 2000
7 1 0 inc82 6000
8 2 1 inc82 3300
9 3 0 inc82 1000
longtab id female year inc
1 1 0 inc80 5000
2 2 1 inc80 2000
3 3 0 inc80 3000
4 1 0 inc81 5500
5 2 1 inc81 2200
6 3 0 inc81 2000
7 1 0 inc82 6000
8 2 1 inc82 3300
9 3 0 inc82 1000
widetab <- spread(longtab, year, inc)
widetab id female inc80 inc81 inc82
1 1 0 5000 5500 6000
2 2 1 2000 2200 3300
3 3 0 3000 2000 1000
head(iris) Sepal.Length Sepal.Width Petal.Length Petal.Width Species
1 5.1 3.5 1.4 0.2 setosa
2 4.9 3.0 1.4 0.2 setosa
3 4.7 3.2 1.3 0.2 setosa
4 4.6 3.1 1.5 0.2 setosa
5 5.0 3.6 1.4 0.2 setosa
6 5.4 3.9 1.7 0.4 setosa
aggregate(iris[,1:4], list(Species = iris$Species), FUN = mean) Species Sepal.Length Sepal.Width Petal.Length Petal.Width
1 setosa 5.006 3.428 1.462 0.246
2 versicolor 5.936 2.770 4.260 1.326
3 virginica 6.588 2.974 5.552 2.026
x <- c('A','B','C','D')
y <- c('D','A','F','J')
setdiff(x, y) # which elements of x are not in y[1] "B" "C"
setdiff(y, x) # which elements of y are not in x[1] "F" "J"
intersect(x, y) # what elements are in both x and y[1] "A" "D"
union(x, y) # elements in either x or y[1] "A" "B" "C" "D" "F" "J"
"C" %in% x # is there a "C" in x[1] TRUE
y %in% c("F","J") # which elements of y are equal to "F" or "J"[1] FALSE FALSE TRUE TRUE
x <- c('A','B','C','D')
grep('C', x)[1] 3
gsub('C', 'c', x)[1] "A" "B" "c" "D"
paste('Item', 1:10, sep="-") [1] "Item-1" "Item-2" "Item-3" "Item-4" "Item-5" "Item-6" "Item-7"
[8] "Item-8" "Item-9" "Item-10"
x <- c('fruit_apple','instrument_guitar','instrument_piano','fruit_orange','instrument_trumpet')
grepl("fruit", x)[1] TRUE FALSE FALSE TRUE FALSE
library(dplyr)
# Key dplyr verbs: filter(), select(), mutate(), summarize(), group_by(), arrange()
library(hflights)
head(hflights) Year Month DayofMonth DayOfWeek DepTime ArrTime UniqueCarrier
5424 2011 1 1 6 1400 1500 AA
5425 2011 1 2 7 1401 1501 AA
5426 2011 1 3 1 1352 1502 AA
5427 2011 1 4 2 1403 1513 AA
5428 2011 1 5 3 1405 1507 AA
5429 2011 1 6 4 1359 1503 AA
FlightNum TailNum ActualElapsedTime AirTime ArrDelay DepDelay Origin
5424 428 N576AA 60 40 -10 0 IAH
5425 428 N557AA 60 45 -9 1 IAH
5426 428 N541AA 70 48 -8 -8 IAH
5427 428 N403AA 70 39 3 3 IAH
5428 428 N492AA 62 44 -3 5 IAH
5429 428 N262AA 64 45 -7 -1 IAH
Dest Distance TaxiIn TaxiOut Cancelled CancellationCode Diverted
5424 DFW 224 7 13 0 0
5425 DFW 224 6 9 0 0
5426 DFW 224 5 17 0 0
5427 DFW 224 9 22 0 0
5428 DFW 224 9 9 0 0
5429 DFW 224 6 13 0 0
str(hflights)'data.frame': 227496 obs. of 21 variables:
$ Year : int 2011 2011 2011 2011 2011 2011 2011 2011 2011 2011 ...
$ Month : int 1 1 1 1 1 1 1 1 1 1 ...
$ DayofMonth : int 1 2 3 4 5 6 7 8 9 10 ...
$ DayOfWeek : int 6 7 1 2 3 4 5 6 7 1 ...
$ DepTime : int 1400 1401 1352 1403 1405 1359 1359 1355 1443 1443 ...
$ ArrTime : int 1500 1501 1502 1513 1507 1503 1509 1454 1554 1553 ...
$ UniqueCarrier : chr "AA" "AA" "AA" "AA" ...
$ FlightNum : int 428 428 428 428 428 428 428 428 428 428 ...
$ TailNum : chr "N576AA" "N557AA" "N541AA" "N403AA" ...
$ ActualElapsedTime: int 60 60 70 70 62 64 70 59 71 70 ...
$ AirTime : int 40 45 48 39 44 45 43 40 41 45 ...
$ ArrDelay : int -10 -9 -8 3 -3 -7 -1 -16 44 43 ...
$ DepDelay : int 0 1 -8 3 5 -1 -1 -5 43 43 ...
$ Origin : chr "IAH" "IAH" "IAH" "IAH" ...
$ Dest : chr "DFW" "DFW" "DFW" "DFW" ...
$ Distance : int 224 224 224 224 224 224 224 224 224 224 ...
$ TaxiIn : int 7 6 5 9 9 6 12 7 8 6 ...
$ TaxiOut : int 13 9 17 22 9 13 15 12 22 19 ...
$ Cancelled : int 0 0 0 0 0 0 0 0 0 0 ...
$ CancellationCode : chr "" "" "" "" ...
$ Diverted : int 0 0 0 0 0 0 0 0 0 0 ...
head(filter(hflights, Month == 1, UniqueCarrier == "AA")) Year Month DayofMonth DayOfWeek DepTime ArrTime UniqueCarrier FlightNum
1 2011 1 1 6 1400 1500 AA 428
2 2011 1 2 7 1401 1501 AA 428
3 2011 1 3 1 1352 1502 AA 428
4 2011 1 4 2 1403 1513 AA 428
5 2011 1 5 3 1405 1507 AA 428
6 2011 1 6 4 1359 1503 AA 428
TailNum ActualElapsedTime AirTime ArrDelay DepDelay Origin Dest Distance
1 N576AA 60 40 -10 0 IAH DFW 224
2 N557AA 60 45 -9 1 IAH DFW 224
3 N541AA 70 48 -8 -8 IAH DFW 224
4 N403AA 70 39 3 3 IAH DFW 224
5 N492AA 62 44 -3 5 IAH DFW 224
6 N262AA 64 45 -7 -1 IAH DFW 224
TaxiIn TaxiOut Cancelled CancellationCode Diverted
1 7 13 0 0
2 6 9 0 0
3 5 17 0 0
4 9 22 0 0
5 9 9 0 0
6 6 13 0 0
head(select(hflights, DepTime, ArrTime, AirTime)) DepTime ArrTime AirTime
5424 1400 1500 40
5425 1401 1501 45
5426 1352 1502 48
5427 1403 1513 39
5428 1405 1507 44
5429 1359 1503 45
head(select(hflights, ends_with("Time"))) DepTime ArrTime ActualElapsedTime AirTime
5424 1400 1500 60 40
5425 1401 1501 60 45
5426 1352 1502 70 48
5427 1403 1513 70 39
5428 1405 1507 62 44
5429 1359 1503 64 45
g1 <- mutate(hflights, ActualGroundTime = ActualElapsedTime - AirTime)
head(select(g1, ActualElapsedTime, AirTime, ActualGroundTime)) ActualElapsedTime AirTime ActualGroundTime
1 60 40 20
2 60 45 15
3 70 48 22
4 70 39 31
5 62 44 18
6 64 45 19
summarize(hflights, min_dist = min(Distance),
mean_dist = mean(Distance),
median_dist = median(Distance),
sd_dist = sd(Distance),
max_dist = max(Distance)) min_dist mean_dist median_dist sd_dist max_dist
1 79 787.7832 809 453.6806 3904
hflights %>% group_by(UniqueCarrier) %>%
summarize(min_dist = min(Distance),
mean_dist = mean(Distance),
median_dist = median(Distance),
sd_dist = sd(Distance),
max_dist = max(Distance))# A tibble: 15 × 6
UniqueCarrier min_dist mean_dist median_dist sd_dist max_dist
<chr> <int> <dbl> <dbl> <dbl> <int>
1 AA 224 483.8212 224 353.269167 964
2 AS 1874 1874.0000 1874 0.000000 1874
3 B6 1428 1428.0000 1428 0.000000 1428
4 CO 140 1098.0549 1190 505.204266 3904
5 DL 689 723.2832 689 103.886047 1076
6 EV 468 775.6815 696 259.664313 1076
7 F9 666 882.7411 883 7.496141 883
8 FL 490 685.4063 696 45.508796 696
9 MQ 247 650.5310 247 447.617384 1379
10 OO 140 819.7279 809 299.852214 1428
11 UA 643 1177.8388 925 326.355580 1635
12 US 912 981.4677 913 110.098250 1325
13 WN 148 606.6218 453 399.144719 1642
14 XE 79 589.0326 562 280.514799 1201
15 YV 912 938.6709 913 79.603621 1190
iris %>%
group_by(Species) %>%
summarize(Sepal.Length = mean(Sepal.Length),
Sepal.Width = mean(Sepal.Width),
Petal.Length = mean(Petal.Length),
Petal.Width = mean(Petal.Width))# A tibble: 3 × 5
Species Sepal.Length Sepal.Width Petal.Length Petal.Width
<fctr> <dbl> <dbl> <dbl> <dbl>
1 setosa 5.006 3.428 1.462 0.246
2 versicolor 5.936 2.770 4.260 1.326
3 virginica 6.588 2.974 5.552 2.026
hflights %>% group_by(UniqueCarrier) %>%
summarize(min_dist = min(Distance),
mean_dist = mean(Distance),
median_dist = median(Distance),
sd_dist = sd(Distance),
max_dist = max(Distance)) %>%
arrange(mean_dist)# A tibble: 15 × 6
UniqueCarrier min_dist mean_dist median_dist sd_dist max_dist
<chr> <int> <dbl> <dbl> <dbl> <int>
1 AA 224 483.8212 224 353.269167 964
2 XE 79 589.0326 562 280.514799 1201
3 WN 148 606.6218 453 399.144719 1642
4 MQ 247 650.5310 247 447.617384 1379
5 FL 490 685.4063 696 45.508796 696
6 DL 689 723.2832 689 103.886047 1076
7 EV 468 775.6815 696 259.664313 1076
8 OO 140 819.7279 809 299.852214 1428
9 F9 666 882.7411 883 7.496141 883
10 YV 912 938.6709 913 79.603621 1190
11 US 912 981.4677 913 110.098250 1325
12 CO 140 1098.0549 1190 505.204266 3904
13 UA 643 1177.8388 925 326.355580 1635
14 B6 1428 1428.0000 1428 0.000000 1428
15 AS 1874 1874.0000 1874 0.000000 1874
Without Piping
arrange(
summarize(
group_by(
filter(mtcars, carb > 1),
cyl),
Avg_mpg = mean(mpg)
),
desc(Avg_mpg)
)# A tibble: 3 × 2
cyl Avg_mpg
<dbl> <dbl>
1 4 25.90
2 6 19.74
3 8 15.10
With Piping
mtcars %>%
filter(carb > 1) %>%
group_by(cyl) %>%
summarize(Avg_mpg = mean(mpg)) %>%
arrange(desc(Avg_mpg))# A tibble: 3 × 2
cyl Avg_mpg
<dbl> <dbl>
1 4 25.90
2 6 19.74
3 8 15.10
hsb2 <- read.table('http://www.ats.ucla.edu/stat/r/modules/hsb2.csv', header=T, sep=",")
library(ggplot2)
ggplot(hsb2, aes(x = write)) + geom_histogram()
library(openxlsx)
fname <- read.xlsx("c:/chuck/NYU/Stat Group/R Workshop/Female-Names.xlsx", sheet = 1)
fname Rank Name Ngirls
1 1 Emma 20799
2 2 Olivia 19674
3 3 Sophia 18490
4 4 Isabella 16950
5 5 Ava 15586
6 6 Mia 13442
7 7 Emily 12562
8 8 Abigail 11985
9 9 Madison 10247
10 10 Charlotte 10048
ggplot(fname, aes(x = Name, y = Ngirls)) +
geom_bar(stat = 'identity') +
ylab('Number') + xlab('') +
ggtitle('Ten Most Common Female Baby\nNames in 2014') +
theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1))
ggplot2 package
ggplot(hsb2, aes(x = write, y = read)) + geom_point()
Base R
plot(read ~ write, data = hsb2)
library(ggplot2)
ggplot(subset(bank, SALNOW < 30000), aes(x = SALBEG, y = SALNOW, colour = GENDER)) +
geom_point() + scale_y_continuous(limits=c(0,30000)) + scale_x_continuous(limits=c(0,30000))
library(ggplot2)
ggplot(subset(bank, SALNOW < 30000), aes(x = SALBEG, y = SALNOW)) +
geom_point() + scale_y_continuous(limits=c(0,30000)) + scale_x_continuous(limits=c(0,30000)) + facet_wrap( ~ GENDER)
hsb <- read.dta('c:/chuck/NYU/Stat Group/R Workshop/hsb.dta')
ggplot(hsb, aes(x = female, y = write)) + geom_boxplot() + xlab("")
ggplot(hsb, aes(x = female, y = write)) + geom_boxplot() + xlab("") + coord_flip()
ggplot(myDF, aes(x=X,y=Y)) + geom_point(color="blue", size=3)
ggplot(myDF, aes(x=X,y=Y)) + geom_point(color="blue", size=3, alpha =.20)
library(igraph)
net.bg <- barabasi.game(80)
V(net.bg)$frame.color <- sample(c("white","black"), size = 80, replace=TRUE, prob=c(0.5,0.5))
V(net.bg)$color <- sample(c("orange","lightblue"), size = 80, replace=TRUE, prob=c(0.5,0.5))
V(net.bg)$label <- ""
V(net.bg)$size <- 10
E(net.bg)$arrow.mode <- 0
plot(net.bg, layout=layout.lgl)
library(ggplot2) # devtools::install_github("hadley/ggplot2")
library(ggalt) # devtools::install_github("hrbrmstr/ggalt")
library(dplyr) # for data_frame() & arrange()
# Pew social-media age-gap data: share of adult internet users / smartphone
# owners who use social networking sites, by country, for ages 35+ vs 18-34.
# NOTE(review): dplyr::data_frame() is deprecated; tibble() is the modern name.
df <- data_frame(country=c("Germany", "France", "Vietnam", "Japan", "Poland", "Lebanon",
                           "Australia", "South\nKorea", "Canada", "Spain", "Italy", "Peru",
                           "U.S.", "UK", "Mexico", "Chile", "China", "India"),
                 ages_35=c(0.39, 0.42, 0.49, 0.43, 0.51, 0.57,
                           0.60, 0.45, 0.65, 0.57, 0.57, 0.65,
                           0.63, 0.59, 0.67, 0.75, 0.52, 0.48),
                 ages_18_to_34=c(0.81, 0.83, 0.86, 0.78, 0.86, 0.90,
                                 0.91, 0.75, 0.93, 0.85, 0.83, 0.91,
                                 0.89, 0.84, 0.90, 0.96, 0.73, 0.69),
                 # round(), not as.integer(): as.integer() truncates toward zero,
                 # so a product such as (ages_18_to_34 - ages_35)*100 that lands
                 # fractionally below the intended whole number due to floating
                 # point would be labelled one percentage point too low.
                 diff=sprintf("+%d", round((ages_18_to_34-ages_35)*100)))
# Sort by the numeric gap itself. Sorting the "+NN" label strings (the original
# desc(diff)) only coincidentally matches numeric order while every gap happens
# to have the same number of digits.
df <- arrange(df, desc(ages_18_to_34 - ages_35))
# Reversed factor levels so ggplot draws the largest gap at the top of the chart.
df$country <- factor(df$country, levels=rev(df$country))
# Format proportions as percent labels, keeping the "%" sign only on the
# first element ("39%", "81", "63", ...) -- the chart prints the unit once.
#
# Args:
#   x: numeric vector of proportions in [0, 1].
# Returns: character vector the same length as x.
percent_first <- function(x) {
  x <- sprintf("%d%%", round(x*100))
  # x[-1] rather than x[2:length(x)]: for a length-1 input, 2:length(x)
  # is c(2, 1), which indexes past the end and corrupts the result,
  # while x[-1] is simply empty (a safe no-op).
  x[-1] <- sub("%$", "", x[-1])
  x
}
# Dumbbell-style chart of the social media age gap (data in `df`):
# one row per country, a dot per age group joined by a segment, percent
# labels under the dots, and a shaded "DIFF" column on the right edge.
gg <- ggplot() +
  # Thin full-width reference line behind each country row...
  geom_segment(data=df, aes(y=country, yend=country, x=0, xend=1), color="#b2b2b2", size=0.15) +
  # ...and a thicker segment spanning the two age-group values.
  geom_segment(data=df, aes(y=country, yend=country, x=ages_35, xend=ages_18_to_34), color="#b2b2b2", size=2) +
  # End-point dots: green = ages 35+, orange = ages 18-34.
  geom_point(data=df, aes(x=ages_35, y = country), size=3, colour = "#9fb059") +
  geom_point(data=df, aes(x=ages_18_to_34, y = country), size=3, colour = "#edae52") +
  # Legend-like annotations drawn once, above the Germany (top) row.
  geom_text(data=dplyr::filter(df, country=="Germany"),
            aes(x=ages_35, y=country, label="Ages 35+"),
            color="#9fb059", size=3, vjust=-1, fontface="bold", family="Calibri") +
  geom_text(data=dplyr::filter(df, country=="Germany"),
            aes(x=ages_18_to_34, y=country, label="Ages 18-34"),
            color="#edae52", size=3, vjust=-1, fontface="bold", family="Calibri") +
  # Value labels beneath each dot; percent_first() keeps "%" on the first row only.
  geom_text(data=df, aes(x=ages_35, y=country, label=percent_first(ages_35)),
            color="#9fb059", size=2.75, vjust=1.75, family="Calibri") +
  geom_text(data=df, color="#edae52", size=2.75, vjust=1.75, family="Calibri",
            aes(x=ages_18_to_34, y=country, label=percent_first(ages_18_to_34))) +
  # Shaded band, per-country gap values, and a "DIFF" header for the right column.
  geom_rect(data=df, aes(xmin=1.05, xmax=1.175, ymin=-Inf, ymax=Inf), fill="#efefe3") +
  geom_text(data=df, aes(label=diff, y=country, x=1.1125), fontface="bold", size=3, family="Calibri") +
  geom_text(data=filter(df, country=="Germany"), aes(x=1.1125, y=country, label="DIFF"),
            color="#7a7d7e", size=3.1, vjust=-2, fontface="bold", family="Calibri") +
  # Axis setup: no padding on x so the band reaches the panel edge.
  scale_x_continuous(expand=c(0,0), limits=c(0, 1.175)) +
  scale_y_discrete(expand=c(0.075,0)) +
  labs(x=NULL, y=NULL, title="The social media age gap",
       subtitle="Adult internet users or reported smartphone owners who\nuse social networking sites",
       caption="Source: Pew Research Center, Spring 2015 Global Attitudes Survey. Q74") +
  # Minimal theme: strip grid, border, ticks, and x-axis text; style the titles.
  theme_bw(base_family="Calibri") +
  theme(panel.grid.major=element_blank()) +
  theme(panel.grid.minor=element_blank()) +
  theme(panel.border=element_blank()) +
  theme(axis.ticks=element_blank()) +
  theme(axis.text.x=element_blank()) +
  theme(plot.title=element_text(face="bold")) +
  theme(plot.subtitle=element_text(face="italic", size=9, margin=margin(b=12))) +
  theme(plot.caption=element_text(size=7, margin=margin(t=12), color="#7a7d7e"))
gg
hsb <- read.dta('c:/chuck/NYU/Stat Group/R Workshop/hsb.dta')
summary(hsb) id female race ses
Min. : 1.00 male : 91 hispanic : 24 low :47
1st Qu.: 50.75 female:109 asian : 11 middle:95
Median :100.50 african-amer: 20 high :58
Mean :100.50 white :145
3rd Qu.:150.25
Max. :200.00
schtyp prog read write
public :168 general : 45 Min. :28.00 Min. :31.00
private: 32 academic:105 1st Qu.:44.00 1st Qu.:45.75
vocation: 50 Median :50.00 Median :54.00
Mean :52.23 Mean :52.77
3rd Qu.:60.00 3rd Qu.:60.00
Max. :76.00 Max. :67.00
math science socst
Min. :33.00 Min. :26.00 Min. :26.00
1st Qu.:45.00 1st Qu.:44.00 1st Qu.:46.00
Median :52.00 Median :53.00 Median :52.00
Mean :52.65 Mean :51.85 Mean :52.41
3rd Qu.:59.00 3rd Qu.:58.00 3rd Qu.:61.00
Max. :75.00 Max. :74.00 Max. :71.00
library(descr)
with(bank, freq(JOBCAT, plot=FALSE))JOBCAT
Frequency Percent
Clerical 227 47.890
Office trainee 136 28.692
Security officer 27 5.696
College trainee 41 8.650
Exempt employee 32 6.751
MBA trainee 5 1.055
Technical 6 1.266
Total 474 100.000
library(descr)
with(bank, CrossTable(SEX, JOBCAT %in% c("Clerical","Office trainee"),
prop.r=TRUE, prop.chi=FALSE, prop.t=FALSE, prop.c=FALSE, chisq=TRUE)). Cell Contents
. |-------------------------|
. | Count |
. | Row Percent |
. |-------------------------|
.
. ================================
. JOBCAT %in% c("Clerical", "Office trainee")
. SEX FALSE TRUE Total
. --------------------------------
. Males 101 157 258
. 39.1% 60.9% 54.4%
. --------------------------------
. Females 10 206 216
. 4.6% 95.4% 45.6%
. --------------------------------
. Total 111 363 474
. ================================
.
. Statistics for All Table Factors
.
. Pearson's Chi-squared test
. ------------------------------------------------------------
. Chi^2 = 78.10967 d.f. = 1 p <2e-16
.
. Pearson's Chi-squared test with Yates' continuity correction
. ------------------------------------------------------------
. Chi^2 = 76.19681 d.f. = 1 p <2e-16
. Minimum expected frequency: 50.58228
library(psych)
psych::describe(subset(hsb, select=c(read,write,math,science,socst))) vars n mean sd median trimmed mad min max range skew
read 1 200 52.23 10.25 50 52.03 10.38 28 76 48 0.19
write 2 200 52.77 9.48 54 53.36 11.86 31 67 36 -0.47
math 3 200 52.65 9.37 52 52.23 10.38 33 75 42 0.28
science 4 200 51.85 9.90 53 52.02 11.86 26 74 48 -0.19
socst 5 200 52.41 10.74 52 52.99 13.34 26 71 45 -0.38
kurtosis se
read -0.66 0.72
write -0.78 0.67
math -0.69 0.66
science -0.60 0.70
socst -0.57 0.76
hsb <- read.dta('c:/chuck/NYU/Stat Group/R Workshop/hsb.dta')
with(hsb, t.test(write, mu=50))
One Sample t-test
data: write
t = 4.1403, df = 199, p-value = 5.121e-05
alternative hypothesis: true mean is not equal to 50
95 percent confidence interval:
51.45332 54.09668
sample estimates:
mean of x
52.775
with(hsb, t.test(write, read, paired = TRUE))
Paired t-test
data: write and read
t = 0.86731, df = 199, p-value = 0.3868
alternative hypothesis: true difference in means is not equal to 0
95 percent confidence interval:
-0.6941424 1.7841424
sample estimates:
mean of the differences
0.545
t.test(write ~ female, data = hsb, var.equal = TRUE)
Two Sample t-test
data: write by female
t = -3.7341, df = 198, p-value = 0.0002463
alternative hypothesis: true difference in means is not equal to 0
95 percent confidence interval:
-7.441835 -2.298059
sample estimates:
mean in group male mean in group female
50.12088 54.99083
t.test(write ~ female, data = hsb)
Welch Two Sample t-test
data: write by female
t = -3.6564, df = 169.71, p-value = 0.0003409
alternative hypothesis: true difference in means is not equal to 0
95 percent confidence interval:
-7.499159 -2.240734
sample estimates:
mean in group male mean in group female
50.12088 54.99083
hsb <- read.spss("c:/chuck/NYU/Stat Group/R Workshop/hsb2.sav", to.data.frame=TRUE)
round(cor(subset(hsb, select=c(READ,WRITE,MATH,SCIENCE,SOCST))), 2) READ WRITE MATH SCIENCE SOCST
READ 1.00 0.60 0.66 0.63 0.62
WRITE 0.60 1.00 0.62 0.57 0.60
MATH 0.66 0.62 1.00 0.63 0.54
SCIENCE 0.63 0.57 0.63 1.00 0.47
SOCST 0.62 0.60 0.54 0.47 1.00
cor.test(~ READ + SOCST, data = hsb)
Pearson's product-moment correlation
data: READ and SOCST
t = 11.163, df = 198, p-value < 2.2e-16
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
0.5282957 0.6998781
sample estimates:
cor
0.6214843
hsb <- read.spss("c:/chuck/NYU/Stat Group/R Workshop/hsb2.sav", to.data.frame=TRUE)
head(hsb) ID FEMALE RACE SES SCHTYP PROG READ WRITE MATH SCIENCE SOCST
1 70 male white low public general 57 52 41 47 57
2 121 female white middle public vocation 68 59 53 63 61
3 86 male white high public general 44 33 54 58 31
4 141 male white high public vocation 63 44 47 53 56
5 172 male white middle public academic 47 52 57 53 61
6 113 male white middle public academic 44 52 51 63 61
lm.1 <- lm(SCIENCE ~ MATH + FEMALE + SOCST + READ, data = hsb)
summary(lm.1)
Call:
lm(formula = SCIENCE ~ MATH + FEMALE + SOCST + READ, data = hsb)
Residuals:
Min 1Q Median 3Q Max
-18.6706 -4.5764 -0.3237 4.5006 21.8563
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 12.32529 3.19356 3.859 0.000154 ***
MATH 0.38931 0.07412 5.252 3.92e-07 ***
FEMALEfemale -2.00976 1.02272 -1.965 0.050820 .
SOCST 0.04984 0.06223 0.801 0.424139
READ 0.33530 0.07278 4.607 7.36e-06 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 7.148 on 195 degrees of freedom
Multiple R-squared: 0.4892, Adjusted R-squared: 0.4788
F-statistic: 46.69 on 4 and 195 DF, p-value: < 2.2e-16
confint(lm.1) 2.5 % 97.5 %
(Intercept) 6.02694246 18.62363584
MATH 0.24312201 0.53549835
FEMALEfemale -4.02677214 0.00724283
SOCST -0.07288986 0.17257843
READ 0.19176512 0.47883447
| Variable | Description | Codes/Values |
|---|---|---|
| LOW | Low Birth Weight | 1 = BWT<=2500g, 0 = BWT>2500g |
| AGE | Age of Mother | Years |
| LWT | Weight of Mother at Last Menstrual Period | Pounds |
| RACE | Race | 1=White, 2=Black, 3=Other |
| FTV | First Trimester Physician Visits | 0=None, 1=One, 2=Two,etc. |
lowbwt <- read.spss('c:/chuck/NYU/Stat Group/R Workshop/lowbwt.sav', to.data.frame=TRUE)
lowbwt$RACE <- car::recode(lowbwt$RACE, "1='White';2='Black';3='Other'", as.factor.result = TRUE)
lowbwt$RACE <- relevel(lowbwt$RACE, ref = 'White')
lrm.1 <- glm(LOW ~ AGE + LWT + RACE + FTV, family='binomial', data = lowbwt)
summary(lrm.1)
Call:
glm(formula = LOW ~ AGE + LWT + RACE + FTV, family = "binomial",
data = lowbwt)
Deviance Residuals:
Min 1Q Median 3Q Max
-1.4163 -0.8931 -0.7113 1.2454 2.0755
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) 1.295366 1.071443 1.209 0.2267
AGE -0.023823 0.033730 -0.706 0.4800
LWT -0.014245 0.006541 -2.178 0.0294 *
RACEBlack 1.003898 0.497859 2.016 0.0438 *
RACEOther 0.433108 0.362240 1.196 0.2318
FTV -0.049308 0.167239 -0.295 0.7681
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 234.67 on 188 degrees of freedom
Residual deviance: 222.57 on 183 degrees of freedom
AIC: 234.57
Number of Fisher Scoring iterations: 4
exp(coef(lrm.1)[-1]) # Removing constant AGE LWT RACEBlack RACEOther FTV
0.9764586 0.9858564 2.7288979 1.5420434 0.9518876
exp(confint(lrm.1)[-1,]) # Removing constant from confidence intervals too 2.5 % 97.5 %
AGE 0.9124232 1.0421301
LWT 0.9725636 0.9979524
RACEBlack 1.0232217 7.3225967
RACEOther 0.7567464 3.1462285
FTV 0.6771508 1.3109248
library(fitbitScraper)
library(scales)
library(ggplot2)
source("c:/chuck/NYU/Stat Group/R Workshop/fitbit-cookie.r")
df <- get_daily_data(cookie, what="distance",
start_date="2014-01-01",
end_date=as.character(Sys.Date()))
fig1 <- ggplot(subset(df, distance > 0),
aes(x=as.Date(time), y= distance,
colour = as.Date(time) > "2015-01-01")) +
geom_point(size=2, shape=1) + ylab("Miles") + xlab("") +
stat_smooth(method='loess', se=FALSE) +
scale_y_continuous(breaks=seq(0,16,1)) +
scale_x_date(breaks = date_breaks("2 months"),
labels = date_format("%b-%Y")) +
scale_colour_manual(name = "After 1/1/15",
values = c("#6B8E23","#8A2BE2")) +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
library(scholar)
library(purrr)
DF <- data.frame(Person = c("Chuck Cleland","Sam Friedman","Holly Hagan","Dani Ompad"),
GID = c("AuiR_rAAAAAJ","bQwCIGQAAAAJ","67LUQJUAAAAJ","Zxdn7DwAAAAJ"))
do.call(rbind, map(map(DF$GID, get_profile),
function(x){data.frame(Person = x$name,
Total.Cites = x$total_cites,
h.index = x$h_index,
i10.index = x$i10_index)})) Person Total.Cites h.index i10.index
1 Charles M. Cleland 3228 29 77
2 Samuel R Friedman 21374 78 295
3 Holly Hagan 16430 49 138
4 Danielle Ompad 3357 33 68
seq(1, 10, 2)[1] 1 3 5 7 9
104:114 [1] 104 105 106 107 108 109 110 111 112 113 114
rep(c('A','B','C'), 3)[1] "A" "B" "C" "A" "B" "C" "A" "B" "C"
rep(c('A','B','C'), each = 3)[1] "A" "A" "A" "B" "B" "B" "C" "C" "C"
seq(1, 4, len = 6)[1] 1.0 1.6 2.2 2.8 3.4 4.0
expand.grid(Person = 1:5, Visit = 1:3) Person Visit
1 1 1
2 2 1
3 3 1
4 4 1
5 5 1
6 1 2
7 2 2
8 3 2
9 4 2
10 5 2
11 1 3
12 2 3
13 3 3
14 4 3
15 5 3
rnorm(10) [1] 2.0665795 -2.7631017 0.1590837 0.3340059 0.9715169 -1.3661629
[7] -0.4248921 0.2323954 0.4803709 1.4822548
runif(10, min=100,max=999) [1] 772.3865 812.8910 291.6639 813.0696 635.7335 919.6467 722.3682
[8] 871.1754 202.0830 628.5196
rbinom(10, 1, .5) [1] 0 1 1 0 1 0 1 0 1 1
sample(letters[1:3], size = 30, replace=TRUE, prob = c(.8,.1,.1)) [1] "a" "a" "a" "a" "a" "a" "b" "c" "a" "a" "a" "b" "a" "a" "b" "a" "a"
[18] "a" "a" "a" "b" "b" "b" "a" "a" "a" "b" "a" "c" "a"
sample(10) [1] 2 8 7 3 6 5 4 9 1 10
available.packages() function
install.packages() function
library() function
install.packages("epiR")
library(epiR)
head(ls("package:epiR"), n = 30) [1] "epi.2by2" "epi.about" "epi.asc"
[4] "epi.betabuster" "epi.bohning" "epi.ccc"
[7] "epi.ccsize" "epi.cluster1size" "epi.cluster2size"
[10] "epi.clustersize" "epi.cohortsize" "epi.conf"
[13] "epi.convgrid" "epi.cp" "epi.cpresids"
[16] "epi.descriptives" "epi.detectsize" "epi.dgamma"
[19] "epi.directadj" "epi.dms" "epi.dsl"
[22] "epi.edr" "epi.empbayes" "epi.equivb"
[25] "epi.equivc" "epi.herdtest" "epi.indirectadj"
[28] "epi.insthaz" "epi.interaction" "epi.iv"